#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
# author:hengk
# contact: hengk@foxmail.com
# datetime:2019-10-23 12:41
"""
import os
from docx import Document
from docx.shared import Inches

from base2txt import BaseTrans

class Doc2Txt(BaseTrans):
    """Convert Word ``.docx`` documents into whitespace-free plain text."""

    @staticmethod
    def tran2txt(src_filename, dst_filename):
        """Extract the paragraph text of a ``.docx`` file into a text file.

        The text of every paragraph exposed by ``document.paragraphs`` is
        concatenated in order without separators, all whitespace characters
        are removed, and the result is written to ``dst_filename``.

        Args:
            src_filename: Path of the ``.docx`` document to read.
            dst_filename: Path of the text file to create or overwrite.
        """
        document = Document(src_filename)
        text = "".join(paragraph.text for paragraph in document.paragraphs)
        # str.split() with no arguments splits on any run of whitespace
        # (spaces, tabs, newlines, ...), so re-joining the pieces drops all
        # whitespace in a single pass.
        text = "".join(text.split())
        # Write with an explicit encoding so non-ASCII document text does not
        # depend on (or fail under) the platform's default locale encoding.
        with open(dst_filename, 'w', encoding='utf-8') as f:
            f.write(text)

if __name__ == '__main__':
    # Batch-convert every .docx file found in the input directory into a
    # .txt file with the same base name inside the corpus directory.
    docx_dir = "/home/kangheng/generators/text_generators/data/words"
    corpus_path = "/home/kangheng/generators/text_generators/data/corpus"
    trans = Doc2Txt()
    for filename in os.listdir(docx_dir):
        print(filename)
        # splitext handles names without a dot correctly, unlike slicing
        # around rfind("."), which returns -1 for such names.
        stem, extension = os.path.splitext(filename)
        # Fail fast on unexpected input. An explicit raise is used instead of
        # assert because assertions are stripped when Python runs with -O.
        if extension != ".docx":
            raise ValueError(
                "expected a .docx file, got: {!r}".format(filename))
        src_name = os.path.join(docx_dir, filename)
        dst_name = os.path.join(corpus_path, stem + ".txt")
        trans.tran2txt(src_name, dst_name)


